R
Whalen Dillon
December 9, 2014
This is a slidy presentation generated using R Markdown in
R
R is optimized for vectorization (what the heck does that mean?)
Generally avoid looping operations:
# Input for the timing comparison: the numbers 1 through 10000.
data <- seq(from = 1, to = 10000, by = 1)

# Slow version, shown on purpose: the result starts out empty and is filled
# one element at a time, so it is extended on every pass through the loop.
data_squared <- NULL
system.time(
  for (idx in data) {
    data_squared[idx] <- data[idx]^2
  }
)
## user system elapsed
## 0.267 0.073 0.345
# Vectorization is faster: `^` is applied to the whole vector in one call.
system.time(data_squared <- data^2)
## user system elapsed
## 0 0 0
# R
# Two ways of loading a delimited file into a data frame. The slide heading
# had been fused onto the first identifier; the variable is `my_data1`,
# matching `my_data2` below.
my_data1 <- read.csv("pathname/mydata.csv")  # Read csv file
my_data2 <- read.table("pathname/mydata.txt")  # Read text file, other delimiter
# R
# R - multiple files
# Collect the full paths of the rain-gauge exports. `pattern` is a regular
# expression, not a shell glob, so the extension is matched with an escaped
# dot and an end-of-string anchor instead of "*.csv".
files <- list.files("Rain_Gauge/2_RG_EXPORTS", pattern = "\\.csv$",
                    full.names = TRUE)
is.vector(files)
## [1] TRUE
class(files)
## [1] "character"
length(files)
## [1] 112
head(files, 3)
## [1] "Rain_Gauge/2_RG_EXPORTS/annadel_day_hr_2004.csv"
## [2] "Rain_Gauge/2_RG_EXPORTS/annadel_day_hr_2005.csv"
## [3] "Rain_Gauge/2_RG_EXPORTS/annadel_day_hr_2006.csv"
# R - multiple files
library(plyr)  # `ldply()` function reads a list, returns a data frame
library(data.table)  # `fread()` function
# Read every path in `files` with fread() and let ldply() combine the
# per-file results into a single data frame. fread is passed directly; the
# anonymous wrapper around it added nothing.
rg_data <- ldply(files, fread)
class(rg_data)
## [1] "data.frame"
head(rg_data, 3)
## id date time events daily_events hourly_events
## 1 annadel 11/12/2003 13:00:00 NA NA 0
## 2 annadel 11/12/2003 14:00:00 NA NA 0
## 3 annadel 11/12/2003 15:00:00 NA NA 0
str(rg_data)
## 'data.frame': 1174694 obs. of 6 variables:
## $ id : chr "annadel" "annadel" "annadel" "annadel" ...
## $ date : chr "11/12/2003" "11/12/2003" "11/12/2003" "11/12/2003" ...
## $ time : chr "13:00:00" "14:00:00" "15:00:00" "16:00:00" ...
## $ events : int NA NA NA NA NA NA NA NA NA NA ...
## $ daily_events : int NA NA NA NA NA NA NA NA NA NA ...
## $ hourly_events: int 0 0 0 0 0 0 0 0 0 0 ...
Join date and time columns into new variable date_time
# Glue the date and time text of each row together with one space between
# them; the result is still plain character data.
rg_data$date_time <- with(rg_data, paste(date, time, sep = " "))
class(rg_data$date_time)
## [1] "character"
Convert date_time into format interpretable by the computer (POSIX)
# Parse the combined text as month/day/year hour:minute:second in UTC.
# strptime() yields a POSIXlt value; it is converted to POSIXct before being
# stored, which is the date-time class meant for data-frame columns.
rg_data$date_time <- as.POSIXct(
  strptime(rg_data$date_time, format = "%m/%d/%Y %H:%M:%S", tz = "UTC")
)
class(rg_data$date_time)
## [1] "POSIXct" "POSIXt"
head(rg_data, 3)
## id date time events daily_events hourly_events
## 1 annadel 11/12/2003 13:00:00 NA NA 0
## 2 annadel 11/12/2003 14:00:00 NA NA 0
## 3 annadel 11/12/2003 15:00:00 NA NA 0
## date_time
## 1 2003-11-12 13:00:00
## 2 2003-11-12 14:00:00
## 3 2003-11-12 15:00:00
Create year, month, and day variables for grouping
- Many functions can’t handle POSIX formatted date/time
These functions come from the data.table package
# Break the timestamp into separate calendar columns to group on later.
rg_data[["year"]] <- year(rg_data[["date_time"]])    # calendar year
rg_data[["month"]] <- month(rg_data[["date_time"]])  # month number
rg_data[["day"]] <- mday(rg_data[["date_time"]])     # day within the month
head(rg_data, 3)
## id date time events daily_events hourly_events
## 1 annadel 11/12/2003 13:00:00 NA NA 0
## 2 annadel 11/12/2003 14:00:00 NA NA 0
## 3 annadel 11/12/2003 15:00:00 NA NA 0
## date_time year month day
## 1 2003-11-12 13:00:00 2003 11 12
## 2 2003-11-12 14:00:00 2003 11 12
## 3 2003-11-12 15:00:00 2003 11 12
library(dplyr)
# Collapse the records to one row per gauge (id) and calendar day.
dy_rg_data <- rg_data %>%
# Keep only the identifier, date parts and event column.
select(id, date, year, month, day, events) %>%
# `date` is not a grouping variable, so it does not survive the summary.
group_by(id, year, month, day) %>%
# daily_events is the number of rows in each group; daily_ppt is that count
# multiplied by 0.01 (presumably inches per event, going by the axis label
# used when daily_ppt is plotted -- TODO confirm).
# NOTE(review): length(events) counts every row in the group, including rows
# where events is NA. The recorded output below shows 24 "events" on most
# days, which looks like the hourly records being counted -- confirm this is
# intended.
summarize(daily_events=length(events), daily_ppt=length(events)*0.01)
# In the recorded output the result is still a grouped_df, with id, year and
# month listed as the remaining grouping variables.
str(dy_rg_data)
## Classes 'grouped_df', 'tbl_df', 'tbl' and 'data.frame': 34870 obs. of 6 variables:
## $ id : chr "annadel" "annadel" "annadel" "annadel" ...
## $ year : int 2003 2003 2003 2003 2003 2003 2003 2003 2003 2003 ...
## $ month : int 11 11 11 11 11 11 11 11 11 11 ...
## $ day : int 12 13 14 15 16 17 18 19 20 21 ...
## $ daily_events: int 11 24 43 32 38 24 24 24 24 24 ...
## $ daily_ppt : num 0.11 0.24 0.43 0.32 0.38 0.24 0.24 0.24 0.24 0.24 ...
## - attr(*, "vars")=List of 3
## ..$ : symbol id
## ..$ : symbol year
## ..$ : symbol month
## - attr(*, "drop")= logi TRUE
# Rebuild a proper Date column from the year, month and day parts: join them
# as "year/month/day" text, then parse that text with the matching format.
dy_rg_data$date <- with(
  dy_rg_data,
  as.Date(paste(year, month, day, sep = "/"), format = "%Y/%m/%d")
)
class(dy_rg_data$date)
## [1] "Date"
summary(dy_rg_data)
## id year month day
## Length:34870 Min. :2003 Min. : 1.000 Min. : 1.00
## Class :character 1st Qu.:2005 1st Qu.: 4.000 1st Qu.: 8.00
## Mode :character Median :2008 Median : 7.000 Median :16.00
## Mean :2008 Mean : 6.613 Mean :15.73
## 3rd Qu.:2011 3rd Qu.:10.000 3rd Qu.:23.00
## Max. :2014 Max. :12.000 Max. :31.00
## daily_events daily_ppt date
## Min. : 2.00 Min. : 0.0200 Min. :2003-09-17
## 1st Qu.: 24.00 1st Qu.: 0.2400 1st Qu.:2005-10-17
## Median : 24.00 Median : 0.2400 Median :2008-08-10
## Mean : 33.69 Mean : 0.3369 Mean :2008-08-25
## 3rd Qu.: 24.00 3rd Qu.: 0.2400 3rd Qu.:2011-01-27
## Max. :6998.00 Max. :69.9800 Max. :2014-05-14
library(ggplot2)
# Daily rainfall through time, drawn as points joined by a line. Colour is
# mapped to whether the daily value exceeds 6, so suspect days stand out.
# Written with ggplot() layers because qplot() is deprecated in current
# ggplot2; the mapping, geoms and axis label are the same as before.
ggplot(dy_rg_data, aes(x = date, y = daily_ppt, colour = daily_ppt > 6)) +
  geom_point() +
  geom_line() +
  ylab("Daily rainfall (inches)")
Maybe a few outliers…
# Same plot restricted to days with daily_ppt below 6, coloured by year, on
# the black-and-white theme. Written with ggplot() layers because qplot() is
# deprecated in current ggplot2; mapping, geoms and label are unchanged.
ggplot(dy_rg_data %>% filter(daily_ppt < 6),
       aes(x = date, y = daily_ppt, colour = year)) +
  geom_point() +
  geom_line() +
  ylab("Daily rainfall (inches)") +
  theme_bw()